setwd("~/analyses")
main_path <- getwd() 
path <- file.path(main_path, "otutables_processing")
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggpmisc)
library(cowplot)

Supplementary Figures

tab_name <- file.path(main_path,"Table_richness_calculations_long.txt")
total_richness_df2 <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)
total_richness_df2$Level <- 
 factor(total_richness_df2$Level,levels = c("99/100", "98.5", "98", "97", 
                                            "96", "95"))
total_richness_df2$Method <- 
 factor(total_richness_df2$Method,levels = c("VSEARCH", "SWARM", "DADA2(+VS)", 
                                             "CROP"))
total_richness_df2$Curated <- 
 factor(total_richness_df2$Curated,levels = c("raw", "curated"))

formula <- y ~ x

#Plot full x/y plots
ggplot(total_richness_df2, aes(x= Obs, y= OTU, color = Curated)) +
  geom_point(pch=21,size=1, alpha = 0.8) +
  geom_abline(intercept = 0, linetype =2) +
  facet_grid(Method ~Level) +
  xlab("Plant richness") +
  ylab("OTU richness") +
  geom_smooth(method = "lm", se = F) +
  stat_poly_eq(geom = "label", 
               alpha = 0.5,aes(label = paste(..eq.label.., 
                                             ..rr.label.., sep = "~~~")), 
               formula = formula, label.x.npc = "left", 
               label.y.npc = "top", parse = TRUE, size = 0.9, label.size = NA) +
  scale_color_brewer(palette = "Set1") + theme_bw() + 
  theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 1. Linear regression of OTU richness vs. plant richness.
OTU richness (number of OTUs in each soil sample from the 130 sites) is plotted on the y-axis. Plant richness (number of plant species observed in each of the 130 40m x 40m sites) is plotted on the x-axis. The dashed line is an identity to evaluate whether the OTU count overestimates (to the left of the line) or underestimates (to the right) the plant richness. The linear model for each regression is shown along with the corresponding coefficient of determination (R2). Red points represent the pre-curation methods, and blue point represent post-curation results. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). VSEARCH and SWARM initially overestimate the richness significantly at most clustering levels. CROP and DADA2 (+/- clustering) perform better, but all measures of correspondence (R2, slope and intersect) are improved with curation for all methods. The curation effect on CROP is low.

\pagebreak

ggplot(total_richness_df2, aes(x= Obs, y= OTU, col = Curated)) +
  geom_point(pch=21,size=1, alpha = 0.8) +
  coord_cartesian(ylim = c(0, 200)) +
  geom_abline(intercept = 0, linetype =2) +
  facet_grid(Method ~Level) +
  xlab("Plant richness") +
  ylab("OTU richness") +
  geom_smooth(method = "lm", se = F) +
  stat_poly_eq(geom = "label", 
               alpha = 0.7,aes(label = paste(..eq.label.., 
                                             ..rr.label.., sep = "~~~")), 
               formula = formula, label.x.npc = "left", 
               label.y.npc = 0.2, label.size = NA, parse = TRUE, size = 0.9) +
  scale_color_brewer(palette = "Set1") + 
  theme_bw() + 
  theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 2. Linear regression of OTU richness vs. plant richness.
OTU richness (number of OTUs in each soil sample from the 130 sites) is plotted on the y-axis. Plant richness (number of plant species observed in each of the 130 40m x 40m sites) is plotted on the x-axis. The dashed line is an identity to evaluate whether the OTU count overestimates (to the left of the line) or underestimates (to the right) the plant richness. The linear model for each regression is shown along with the corresponding coefficient of determination (R2). Red points represent the pre-curation methods, and blue point represent post-curation results. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). VSEARCH and SWARM initially overestimate the richness significantly at most clustering levels. CROP and DADA2 (+/- clustering) perform better, but all measures of correspondence (R2, slope and intersect) are improved with curation for all methods. The curation effect on CROP is low. Identical to Supplementary Figure 1, but shows a truncated y-axis for better illustrattion of the correlations, by excluding the extreme values.

\pagebreak

tab_name <- file.path(main_path,"Table_method_statistics.txt")
method_statistics <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)
method_statistics$Level <- 
 factor(method_statistics$Level,levels = c("99/100", "98.5", "98", "97", 
                                           "96", "95"))
method_statistics$Method <- 
 factor(method_statistics$Method,levels = c("VSEARCH", "SWARM", "DADA2(+VS)", 
                                            "CROP"))
method_statistics$Curated <- 
 factor(method_statistics$Curated,levels = c("raw", "curated"))
ggplot(method_statistics,aes(x=Curated,weights=OTU_count,fill=Curated)) +
  geom_bar(position="dodge") + 
  geom_hline(yintercept = 564,linetype = 2) +
  facet_grid(Method ~Level) +
  xlab("") +
  ylab("OTUs") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw()+
  theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 3. Total number of OTUs vs total number of plant species recorded.
Total method level OTU richness is plotted on the y-axis. Red bars represent OTUs of the un-curated methods and blue bars represent OTUs of the curated methods. The dashed line indicates the total number of species (564) observed in the study for comparison. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). All methods (except CROP) initially identify many more OTUs than can realistically be expected from the inventories, but all measures of total richness are reduced to realistic levels with curation for all methods. The curation effect on CROP is low.

\pagebreak

ggplot(method_statistics,aes(x=Curated,weights=OTU_count,fill=Curated)) +
  geom_bar(position="dodge") + 
  geom_hline(yintercept = 564,linetype = 2) +
  facet_grid(Method ~Level,scale = "free_y") +
  xlab("") +
  ylab("OTUs") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw()+
  theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 4. Total number of OTUs vs total number of plant species recorded.
Total method level OTU richness is plotted on the y-axis. Red bars represent OTUs of the un-curated methods and blue bars represent OTUs of the curated methods. The dashed line indicates the total number of species (564) observed in the study for comparison. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). All methods (except CROP) initially identify many more OTUs than can realistically be expected from the inventories, but all measures of total richness are reduced to realistic levels with curation for all methods. The curation effect on CROP is low. Identical to Supplementary Figure 3, but with a flexible y-axis for better comparison of the low richness CROP method.

\pagebreak

ggplot(method_statistics,aes(x=Curated,weights=Redundancy,fill=Curated)) +
  geom_bar(position="dodge") + 
  facet_grid(Method ~Level) +
  xlab("") +
  ylab("Taxonomic redundancy (percentage of OTUs with redundant 
       taxonomic annotation)") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw()+
  scale_y_continuous(labels = scales::percent) + 
  theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 5. Taxonomic redundancy of each method.
Taxonomic redundancy (the proportion of OTUs with a redundant taxonomic assignment) is plotted on the y-axis. Red bars represent the redundancy of the taxonomic assignment from the un-curated methods and blue bars represent that of the curated methods. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). All methods initially have a high taxonomic redundancy – even CROP, which contains very few OTUs – but taxonomic redundancy is reduced to realistic levels with curation for all methods.

\pagebreak

#Get beta diversity of plant survey
tab_name <- file.path(main_path,"Table_plants_2014_cleaned.txt")
Plant_data2014 <- read.table(tab_name, sep="\t", row.names = 1, 
                             header=TRUE, as.is=TRUE)
Plant_richness <- colSums(Plant_data2014)
obs_beta <- nrow(Plant_data2014)/mean(Plant_richness)

ggplot(method_statistics,aes(x=Curated,weights=Beta,fill=Curated)) +
  geom_bar(position="dodge") + 
  geom_hline(yintercept = obs_beta,linetype = 2) +
  facet_grid(Method ~Level) +
  xlab("") +
  ylab("Betadiversity (total richness / avg. plot richness)") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw()+ theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 6. Betadiversity of each method.
Betadiversity (calculated as total number of OTUs divided by the mean number of OTUs in the 130 sites) is plotted on the y-axis. Red bars represent the betadiversity of the taxonomic assignment from the un-curated methods and blue bars represent that of the curated methods. The dashed line indicates the betadiversity of the plant data (17.23) observed in the study for comparison, calculated in the same way. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). All methods initially have a higher betadiversity than can be expected from observational plant data – even CROP, which contain very few OTU. The betadiversity is reduced to realistic levels with curation for all methods.

\pagebreak

tab_name <- file.path(main_path,"Table_OTU_match_rates.txt")
pident_frame <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)

pident_frame$level_pident <- 
 factor(pident_frame$level_pident,levels = c("99/100", "98.5", "98", "97", 
                                             "96", "95"))
pident_frame$method_pident <- 
 factor(pident_frame$method_pident,levels = c("VSEARCH", "SWARM", "DADA2(+VS)", 
                                              "CROP"))

#Violin plot of distribution of best matches, all methods
ggplot(pident_frame, aes(x=retained_or_discarded, 
                         y=pident,fill=retained_or_discarded)) +
  geom_violin() +
  facet_grid(method_pident~level_pident)+
  xlab("number of OTUs") +
  ylab("Best match on GenBank") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw()+
  theme(text = element_text(size=8)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 7. Distribution of best matches for OTUs on GenBank.
Density distribution of the best reference database match for all OTUs (percent identity (%) of best matching reference sequence on GenBank) is plotted as a violin plot. Red bars represent OTUs discarded by the LULU algorithm, and blue bars represent OTUs retained/curated by the algorithm. The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%). For all methods the curation of LULU discards primarily OTUs with best matches of less than 99-100%, and retains predominantly perfectly or near-perfectly matching sequences. The CROP method contains a much higher proportion of OTUs with low matches, even after curation.



tobiasgf/lulu documentation built on Jan. 17, 2024, 3:57 p.m.